library(GEOquery)
Loading required package: Biobase
Loading required package: BiocGenerics
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename,
cbind, colnames, dirname, do.call, duplicated, eval,
evalq, Filter, Find, get, grep, grepl, intersect,
is.unsorted, lapply, Map, mapply, match, mget,
order, paste, pmax, pmax.int, pmin, pmin.int,
Position, rank, rbind, Reduce, rownames, sapply,
setdiff, sort, table, tapply, union, unique,
unsplit, which.max, which.min
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages
'citation("pkgname")'.
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Setting options('download.file.method.GEOquery'='auto')
Setting options('GEOquery.inmemory.gpl'=FALSE)
library(oligo)
Loading required package: oligoClasses
Welcome to oligoClasses version 1.56.0
Loading required package: Biostrings
Loading required package: S4Vectors
Loading required package: stats4
Attaching package: ‘S4Vectors’
The following objects are masked from ‘package:base’:
expand.grid, I, unname
Loading required package: IRanges
Loading required package: XVector
Loading required package: GenomeInfoDb
Attaching package: ‘Biostrings’
The following object is masked from ‘package:base’:
strsplit
================================================================
Welcome to oligo version 1.58.0
================================================================
library(sva)
Loading required package: mgcv
Loading required package: nlme
Attaching package: ‘nlme’
The following object is masked from ‘package:Biostrings’:
collapse
The following object is masked from ‘package:IRanges’:
collapse
This is mgcv 1.8-38. For overview type 'help("mgcv-package")'.
Loading required package: genefilter
Loading required package: BiocParallel
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ─────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5 ✓ purrr 0.3.4
✓ tibble 3.1.6 ✓ dplyr 1.0.7
✓ tidyr 1.1.4 ✓ stringr 1.4.0
✓ readr 2.1.0 ✓ forcats 0.5.1
── Conflicts ────────────────────────── tidyverse_conflicts() ──
x dplyr::collapse() masks nlme::collapse(), Biostrings::collapse(), IRanges::collapse()
x dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
x purrr::compact() masks XVector::compact()
x dplyr::desc() masks IRanges::desc()
x tidyr::expand() masks S4Vectors::expand()
x dplyr::filter() masks stats::filter()
x dplyr::first() masks S4Vectors::first()
x dplyr::lag() masks stats::lag()
x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
x purrr::reduce() masks IRanges::reduce()
x dplyr::rename() masks S4Vectors::rename()
x dplyr::slice() masks XVector::slice(), IRanges::slice()
x readr::spec() masks genefilter::spec()
x dplyr::summarize() masks oligo::summarize()
library(ggsci)
library(factoextra)
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(pheatmap)
library(dendextend)
---------------------
Welcome to dendextend version 1.15.2
Type citation('dendextend') for how to cite the package.
Type browseVignettes(package = 'dendextend') for the package vignette.
The github page is: https://github.com/talgalili/dendextend/
Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
You may ask questions at stackoverflow, use the r and dendextend tags:
https://stackoverflow.com/questions/tagged/dendextend
To suppress this message use: suppressPackageStartupMessages(library(dendextend))
---------------------
Attaching package: ‘dendextend’
The following object is masked from ‘package:Biostrings’:
nnodes
The following object is masked from ‘package:stats’:
cutree
library(caret)
Loading required package: lattice
Attaching package: ‘caret’
The following object is masked from ‘package:purrr’:
lift
library(RColorBrewer)
library(viridis)
Loading required package: viridisLite
library(UpSetR)
Attaching package: ‘UpSetR’
The following object is masked from ‘package:lattice’:
histogram
library(ComplexUpset)
Attaching package: ‘ComplexUpset’
The following object is masked from ‘package:UpSetR’:
upset
# given a matrix, perform min-max scaling on its columns
min_max_mat <- function(mat){
mat_rescaled <- apply(mat, 2, function(v){
v_range <- range(v)
names(v_range) <- c("minimum", "maximum")
range_difference <- v_range["maximum"] - v_range["minimum"]
rescaled <- (v - v_range["minimum"])/range_difference
return(rescaled)
})
return(mat_rescaled)
}
# geodata <- GEOquery::getGEO(GEO = "GSE76275", destdir = "./tempfiles")
# geodata <- GEOquery::getGEO(filename = "./tempfiles/GSE76275_series_matrix.txt.gz")
# saveRDS(geodata, "geodata.RDS")
geodata <- readRDS("geodata.RDS")
# mdata <- geodata %>%
# pluck(1) %>%
# phenoData() %>%
# pData() %>% as_tibble()
# feature_data <- geodata %>%
# pluck(1) %>%
# featureData()
# write_csv(mdata, "raw_mdata.csv")
mdata <- read_csv("raw_mdata.csv")
Rows: 265 Columns: 69
── Column specification ────────────────────────────────────────
Delimiter: ","
chr (62): title, geo_accession, status, submission_date, las...
dbl (6): channel_count, taxid_ch1, contact_zip/postal_code,...
lgl (1): growth_protocol_ch1
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# saveRDS(feature_data, "featureData.RDS")
# feature_data <- readRDS("featureData.RDS")
mdata %>%
glimpse()
Rows: 265
Columns: 23
$ title <chr> "S1-H10", "S1-H14…
$ submission_date <chr> "Dec 17 2015", "D…
$ last_update_date <chr> "Dec 18 2015", "D…
$ geo_accession <chr> "GSM1974566", "GS…
$ `age (years):ch1` <dbl> NA, 41, 55, 55, 6…
$ `ajcc stage (7th edition, 2010):ch1` <chr> "T2N1M0", "T1N0M0…
$ `body mass index:ch1` <dbl> 32, 29, NA, 31, 3…
$ `er:ch1` <chr> "Negative", "Nega…
$ `gender:ch1` <chr> "Female", "Female…
$ `her2:ch1` <chr> "Negative", "Nega…
$ `histology group:ch1` <chr> "Infiltrating Duc…
$ `histology:ch1` <chr> "Infiltrating Duc…
$ `menopausal status:ch1` <chr> "Post-Menopausal"…
$ `metastases:ch1` <chr> "No mets", "No me…
$ `positive nodes:ch1` <chr> "1 - 3", "0", "0"…
$ `pr:ch1` <chr> "Negative", "Nega…
$ `race:ch1` <chr> "Caucasian", "Cau…
$ `set:ch1` <chr> "Validation TN", …
$ `tissue:ch1` <chr> "Breast cancer", …
$ `tnbc subtype:ch1` <chr> "Mesenchymal (MES…
$ `triple-negative status:ch1` <chr> "TN", "TN", "TN",…
$ `tumor grade:ch1` <chr> NA, "Poorly Diffe…
$ `tumor size:ch1` <chr> "2 - 5 cm", "<=2c…
mdata <- mdata %>%
select(title, contains("date"), geo_accession, contains(":ch1"))
colnames(mdata)
[1] "title"
[2] "submission_date"
[3] "last_update_date"
[4] "geo_accession"
[5] "age (years):ch1"
[6] "ajcc stage (7th edition, 2010):ch1"
[7] "body mass index:ch1"
[8] "er:ch1"
[9] "gender:ch1"
[10] "her2:ch1"
[11] "histology group:ch1"
[12] "histology:ch1"
[13] "menopausal status:ch1"
[14] "metastases:ch1"
[15] "positive nodes:ch1"
[16] "pr:ch1"
[17] "race:ch1"
[18] "set:ch1"
[19] "tissue:ch1"
[20] "tnbc subtype:ch1"
[21] "triple-negative status:ch1"
[22] "tumor grade:ch1"
[23] "tumor size:ch1"
cnames <- colnames(mdata)
cnames_processed <- str_split(cnames, pattern = ":") %>%
map_chr(~{.x[[1]]}) %>%
str_replace_all(" ", "_") %>%
str_replace_all("-", "_") %>%
str_remove_all("\\(|\\)|,")
cnames_processed
[1] "title" "submission_date"
[3] "last_update_date" "geo_accession"
[5] "age_years" "ajcc_stage_7th_edition_2010"
[7] "body_mass_index" "er"
[9] "gender" "her2"
[11] "histology_group" "histology"
[13] "menopausal_status" "metastases"
[15] "positive_nodes" "pr"
[17] "race" "set"
[19] "tissue" "tnbc_subtype"
[21] "triple_negative_status" "tumor_grade"
[23] "tumor_size"
colnames(mdata) <- cnames_processed
rm(cnames, cnames_processed)
glimpse(mdata)
Rows: 265
Columns: 23
$ title <chr> "S1-H10", "S1-H14", "S1-H1…
$ submission_date <chr> "Dec 17 2015", "Dec 17 201…
$ last_update_date <chr> "Dec 18 2015", "Dec 18 201…
$ geo_accession <chr> "GSM1974566", "GSM1974567"…
$ age_years <dbl> NA, 41, 55, 55, 65, 40, 66…
$ ajcc_stage_7th_edition_2010 <chr> "T2N1M0", "T1N0M0", "T2N0M…
$ body_mass_index <dbl> 32, 29, NA, 31, 38, 22, 22…
$ er <chr> "Negative", "Negative", "N…
$ gender <chr> "Female", "Female", "Femal…
$ her2 <chr> "Negative", "Negative", "N…
$ histology_group <chr> "Infiltrating Ductal Carci…
$ histology <chr> "Infiltrating Ductal Carci…
$ menopausal_status <chr> "Post-Menopausal", "Post-M…
$ metastases <chr> "No mets", "No mets", "No …
$ positive_nodes <chr> "1 - 3", "0", "0", "0", "4…
$ pr <chr> "Negative", "Negative", "N…
$ race <chr> "Caucasian", "Caucasian", …
$ set <chr> "Validation TN", "Validati…
$ tissue <chr> "Breast cancer", "Breast c…
$ tnbc_subtype <chr> "Mesenchymal (MES)", "Basa…
$ triple_negative_status <chr> "TN", "TN", "TN", "TN", "T…
$ tumor_grade <chr> NA, "Poorly Differentiated…
$ tumor_size <chr> "2 - 5 cm", "<=2cm", "2 - …
mdata <- mdata %>%
mutate(her2 = if_else(!is.na(her2), her2, "Not Available")) %>%
mutate(er = factor(er, levels = c("Negative", "Positive")),
pr = factor(pr, levels = c("Negative", "Positive")),
her2 = factor(her2, levels = c("Negative", "Positive", "Not Available"))) %>%
select(geo_accession, everything())
head(mdata)
Reordering the columns in the metadata.
mdata <- select(mdata, geo_accession, everything())
head(mdata)
Celfiles downloaded from GEO and kept the folder celfiles/
celFiles <- list.celfiles('celfiles/', full.names = TRUE, listGzipped = TRUE)
celFiles %>% head()
[1] "celfiles//GSM1974566_S1_H10.CEL.gz"
[2] "celfiles//GSM1974567_S1_H14.CEL.gz"
[3] "celfiles//GSM1974568_S1_H19.CEL.gz"
[4] "celfiles//GSM1974569_S1_H20B.CEL.gz"
[5] "celfiles//GSM1974570_S1_H22.CEL.gz"
[6] "celfiles//GSM1974571_S1_H27.CEL.gz"
names(celFiles) <- celFiles %>%
basename() %>%
str_split("\\.") %>%
map_chr(~{.x[1]}) %>%
str_split("_") %>%
map_chr(~{.x[1]})
head(celFiles)
GSM1974566
"celfiles//GSM1974566_S1_H10.CEL.gz"
GSM1974567
"celfiles//GSM1974567_S1_H14.CEL.gz"
GSM1974568
"celfiles//GSM1974568_S1_H19.CEL.gz"
GSM1974569
"celfiles//GSM1974569_S1_H20B.CEL.gz"
GSM1974570
"celfiles//GSM1974570_S1_H22.CEL.gz"
GSM1974571
"celfiles//GSM1974571_S1_H27.CEL.gz"
Rearranging rows of metadata to match order of samples in celFiles.
mdata <- mdata[match(mdata$geo_accession, names(celFiles)), ]
Getting only the relevant variables from the metadata.
mdata_subset <- mdata %>%
select(geo_accession,
title,
triple_negative_status,
tnbc_subtype,
submission_date,
er,
her2,
pr,
race,
set,
gender,
age_years) %>%
mutate(across(where(is.character), .fns = factor)) %>%
mutate(tnbc_subtype = if_else(is.na(as.character(tnbc_subtype)), "Not Applicable", as.character(tnbc_subtype))) %>%
mutate(tnbc_subtype = factor(tnbc_subtype)) %>%
as.data.frame()
rownames(mdata_subset) <- as.character(mdata_subset$geo_accession)
head(mdata_subset)
NA
rawData <- read.celfiles(celFiles, phenoData = AnnotatedDataFrame(mdata_subset))
Platform design info loaded.
Reading in : celfiles//GSM1974566_S1_H10.CEL.gz
Reading in : celfiles//GSM1974567_S1_H14.CEL.gz
Reading in : celfiles//GSM1974568_S1_H19.CEL.gz
Reading in : celfiles//GSM1974569_S1_H20B.CEL.gz
Reading in : celfiles//GSM1974570_S1_H22.CEL.gz
Reading in : celfiles//GSM1974571_S1_H27.CEL.gz
Reading in : celfiles//GSM1974572_S1_H28.CEL.gz
Reading in : celfiles//GSM1974573_S1_H29.CEL.gz
Reading in : celfiles//GSM1974574_S1_H2B.CEL.gz
Reading in : celfiles//GSM1974575_S1_H31.CEL.gz
Reading in : celfiles//GSM1974576_S1_H35B.CEL.gz
Reading in : celfiles//GSM1974577_S1_H36.CEL.gz
Reading in : celfiles//GSM1974578_S1_H38.CEL.gz
Reading in : celfiles//GSM1974579_S1_H3B.CEL.gz
Reading in : celfiles//GSM1974580_S1_H40.CEL.gz
Reading in : celfiles//GSM1974581_S1_H41.CEL.gz
Reading in : celfiles//GSM1974582_S2_H43.CEL.gz
Reading in : celfiles//GSM1974583_S2_H44.CEL.gz
Reading in : celfiles//GSM1974584_S2_H45.CEL.gz
Reading in : celfiles//GSM1974585_S2_H46.CEL.gz
Reading in : celfiles//GSM1974586_S2_H47.CEL.gz
Reading in : celfiles//GSM1974587_S2_H48.CEL.gz
Reading in : celfiles//GSM1974588_S2_H49.CEL.gz
Reading in : celfiles//GSM1974589_S1_H4B.CEL.gz
Reading in : celfiles//GSM1974590_S2_H50.CEL.gz
Reading in : celfiles//GSM1974591_S2_H51.CEL.gz
Reading in : celfiles//GSM1974592_S2_H52.CEL.gz
Reading in : celfiles//GSM1974593_S2_H53.CEL.gz
Reading in : celfiles//GSM1974594_S2_H58B.CEL.gz
Reading in : celfiles//GSM1974595_S2_H59.CEL.gz
Reading in : celfiles//GSM1974596_S1_H6.CEL.gz
Reading in : celfiles//GSM1974597_S2_H60.CEL.gz
Reading in : celfiles//GSM1974598_S2_H61.CEL.gz
Reading in : celfiles//GSM1974599_S2_H62.CEL.gz
Reading in : celfiles//GSM1974600_S2_H63.CEL.gz
Reading in : celfiles//GSM1974601_S2_H64.CEL.gz
Reading in : celfiles//GSM1974602_S2_H65.CEL.gz
Reading in : celfiles//GSM1974603_S2_H66.CEL.gz
Reading in : celfiles//GSM1974604_S2_H67.CEL.gz
Reading in : celfiles//GSM1974605_S2_H68.CEL.gz
Reading in : celfiles//GSM1974606_S2_H69.CEL.gz
Reading in : celfiles//GSM1974607_S1_H7.CEL.gz
Reading in : celfiles//GSM1974608_S2_H70.CEL.gz
Reading in : celfiles//GSM1974609_S2_H71.CEL.gz
Reading in : celfiles//GSM1974610_S2_H72B.CEL.gz
Reading in : celfiles//GSM1974611_S2_H73.CEL.gz
Reading in : celfiles//GSM1974612_S1_H8.CEL.gz
Reading in : celfiles//GSM1974613_S1_H9.CEL.gz
Reading in : celfiles//GSM1974614_S2_H54B.CEL.gz
Reading in : celfiles//GSM1974615_S2_H55B.CEL.gz
Reading in : celfiles//GSM1974616_S2_H56B.CEL.gz
Reading in : celfiles//GSM1974617_S2_H57C.CEL.gz
Reading in : celfiles//GSM1974618_S2_H76.CEL.gz
Reading in : celfiles//GSM1974619_S2_H77.CEL.gz
Reading in : celfiles//GSM1974620_S2_H78.CEL.gz
Reading in : celfiles//GSM1974621_S2_H79.CEL.gz
Reading in : celfiles//GSM1974622_S2_H80.CEL.gz
Reading in : celfiles//GSM1974623_S2_H81.CEL.gz
Reading in : celfiles//GSM1974624_S2_H82.CEL.gz
Reading in : celfiles//GSM1974625_S2_H83.CEL.gz
Reading in : celfiles//GSM1974626_S2_H84.CEL.gz
Reading in : celfiles//GSM1974627_S2_H85B.CEL.gz
Reading in : celfiles//GSM1974628_S2_H88.CEL.gz
Reading in : celfiles//GSM1974629_S2_H89.CEL.gz
Reading in : celfiles//GSM1974630_S2_H90.CEL.gz
Reading in : celfiles//GSM1974631_S2_H91B.CEL.gz
Reading in : celfiles//GSM1974632_S3_H100C.CEL.gz
Reading in : celfiles//GSM1974633_S3_H102C.CEL.gz
Reading in : celfiles//GSM1974634_S3_H103C.CEL.gz
Reading in : celfiles//GSM1974635_S3_H104C.CEL.gz
Reading in : celfiles//GSM1974636_S3_H105C.CEL.gz
Reading in : celfiles//GSM1974637_S3_H106C.CEL.gz
Reading in : celfiles//GSM1974638_S3_H107C.CEL.gz
Reading in : celfiles//GSM1974639_S3_H108C.CEL.gz
Reading in : celfiles//GSM1974640_S3_H109C.CEL.gz
Reading in : celfiles//GSM1974641_S3_H110C.CEL.gz
Reading in : celfiles//GSM1974642_S3_H111C.CEL.gz
Reading in : celfiles//GSM1974643_S3_H113C.CEL.gz
Reading in : celfiles//GSM1974644_S3_H114C.CEL.gz
Reading in : celfiles//GSM1974645_S3_H115.CEL.gz
Reading in : celfiles//GSM1974646_S3_H116C.CEL.gz
Reading in : celfiles//GSM1974647_S3_H117C.CEL.gz
Reading in : celfiles//GSM1974648_S3_H118C.CEL.gz
Reading in : celfiles//GSM1974649_S3_H119C.CEL.gz
Reading in : celfiles//GSM1974650_S3_H120C.CEL.gz
Reading in : celfiles//GSM1974651_S3_H121C.CEL.gz
Reading in : celfiles//GSM1974652_S3_H122C.CEL.gz
Reading in : celfiles//GSM1974653_S3_H123C.CEL.gz
Reading in : celfiles//GSM1974654_S3_H124C.CEL.gz
Reading in : celfiles//GSM1974655_S3_H125C.CEL.gz
Reading in : celfiles//GSM1974656_S3_H126C.CEL.gz
Reading in : celfiles//GSM1974657_S3_H127C.CEL.gz
Reading in : celfiles//GSM1974658_S3_H128C.CEL.gz
Reading in : celfiles//GSM1974659_S3_H129C.CEL.gz
Reading in : celfiles//GSM1974660_S3_H130.CEL.gz
Reading in : celfiles//GSM1974661_S3_H131.CEL.gz
Reading in : celfiles//GSM1974662_S3_H132.CEL.gz
Reading in : celfiles//GSM1974663_S3_H133D.CEL.gz
Reading in : celfiles//GSM1974664_S3_H134.CEL.gz
Reading in : celfiles//GSM1974665_S3_H135B.CEL.gz
Reading in : celfiles//GSM1974666_S3_H136B.CEL.gz
Reading in : celfiles//GSM1974667_S3_H137B.CEL.gz
Reading in : celfiles//GSM1974668_S3_H138.CEL.gz
Reading in : celfiles//GSM1974669_S3_H139.CEL.gz
Reading in : celfiles//GSM1974670_S3_H140.CEL.gz
Reading in : celfiles//GSM1974671_S3_H141.CEL.gz
Reading in : celfiles//GSM1974672_S3_H142.CEL.gz
Reading in : celfiles//GSM1974673_S3_H143.CEL.gz
Reading in : celfiles//GSM1974674_S3_H144.CEL.gz
Reading in : celfiles//GSM1974675_S3_H145.CEL.gz
Reading in : celfiles//GSM1974676_S3_H146.CEL.gz
Reading in : celfiles//GSM1974677_S3_H147.CEL.gz
Reading in : celfiles//GSM1974678_S3_H148.CEL.gz
Reading in : celfiles//GSM1974679_S3_H149.CEL.gz
Reading in : celfiles//GSM1974680_S3_H150.CEL.gz
Reading in : celfiles//GSM1974681_S3_H151.CEL.gz
Reading in : celfiles//GSM1974682_S3_H152.CEL.gz
Reading in : celfiles//GSM1974683_S3_H153.CEL.gz
Reading in : celfiles//GSM1974684_S3_H154.CEL.gz
Reading in : celfiles//GSM1974685_S3_H155.CEL.gz
Reading in : celfiles//GSM1974686_S3_H156.CEL.gz
Reading in : celfiles//GSM1974687_S3_H157.CEL.gz
Reading in : celfiles//GSM1974688_S3_H158.CEL.gz
Reading in : celfiles//GSM1974689_S3_H159.CEL.gz
Reading in : celfiles//GSM1974690_S3_H160.CEL.gz
Reading in : celfiles//GSM1974691_S3_H161.CEL.gz
Reading in : celfiles//GSM1974692_S3_H162.CEL.gz
Reading in : celfiles//GSM1974693_S3_H163.CEL.gz
Reading in : celfiles//GSM1974694_S3_H164C.CEL.gz
Reading in : celfiles//GSM1974695_S3_H165.CEL.gz
Reading in : celfiles//GSM1974696_S3_H166.CEL.gz
Reading in : celfiles//GSM1974697_S3_H167.CEL.gz
Reading in : celfiles//GSM1974698_S3_H168.CEL.gz
Reading in : celfiles//GSM1974699_S3_H170.CEL.gz
Reading in : celfiles//GSM1974700_S3_H171.CEL.gz
Reading in : celfiles//GSM1974701_S3_H172B.CEL.gz
Reading in : celfiles//GSM1974702_S3_H173.CEL.gz
Reading in : celfiles//GSM1974703_S3_H174.CEL.gz
Reading in : celfiles//GSM1974704_S3_H175B.CEL.gz
Reading in : celfiles//GSM1974705_S3_H176.CEL.gz
Reading in : celfiles//GSM1974706_S3_H177.CEL.gz
Reading in : celfiles//GSM1974707_S3_H178.CEL.gz
Reading in : celfiles//GSM1974708_S3_H179.CEL.gz
Reading in : celfiles//GSM1974709_S3_H180.CEL.gz
Reading in : celfiles//GSM1974710_S3_H181.CEL.gz
Reading in : celfiles//GSM1974711_S3_H182.CEL.gz
Reading in : celfiles//GSM1974712_S3_H183.CEL.gz
Reading in : celfiles//GSM1974713_S3_H184.CEL.gz
Reading in : celfiles//GSM1974714_S3_H185.CEL.gz
Reading in : celfiles//GSM1974715_S3_H186.CEL.gz
Reading in : celfiles//GSM1974716_S3_H187B.CEL.gz
Reading in : celfiles//GSM1974717_S3_H188.CEL.gz
Reading in : celfiles//GSM1974718_S3_H189.CEL.gz
Reading in : celfiles//GSM1974719_S3_H190.CEL.gz
Reading in : celfiles//GSM1974720_S3_H191B.CEL.gz
Reading in : celfiles//GSM1974721_S3_H193.CEL.gz
Reading in : celfiles//GSM1974722_S3_H194.CEL.gz
Reading in : celfiles//GSM1974723_S3_H196.CEL.gz
Reading in : celfiles//GSM1974724_S3_H197.CEL.gz
Reading in : celfiles//GSM1974725_S3_H198.CEL.gz
Reading in : celfiles//GSM1974726_S3_H199.CEL.gz
Reading in : celfiles//GSM1974727_S3_H200.CEL.gz
Reading in : celfiles//GSM1974728_S3_H201.CEL.gz
Reading in : celfiles//GSM1974729_S3_H202.CEL.gz
Reading in : celfiles//GSM1974730_S3_H203.CEL.gz
Reading in : celfiles//GSM1974731_S3_H204.CEL.gz
Reading in : celfiles//GSM1974732_S3_H205B.CEL.gz
Reading in : celfiles//GSM1974733_S3_H206B.CEL.gz
Reading in : celfiles//GSM1974734_S3_H207B.CEL.gz
Reading in : celfiles//GSM1974735_S3_H208B.CEL.gz
Reading in : celfiles//GSM1974736_S3_H209B.CEL.gz
Reading in : celfiles//GSM1974737_S3_H210B.CEL.gz
Reading in : celfiles//GSM1974738_S3_H211B.CEL.gz
Reading in : celfiles//GSM1974739_S3_H212.CEL.gz
Reading in : celfiles//GSM1974740_S3_H213.CEL.gz
Reading in : celfiles//GSM1974741_S3_H214.CEL.gz
Reading in : celfiles//GSM1974742_S3_H215.CEL.gz
Reading in : celfiles//GSM1974743_S3_H216.CEL.gz
Reading in : celfiles//GSM1974744_S3_H217.CEL.gz
Reading in : celfiles//GSM1974745_S3_H218.CEL.gz
Reading in : celfiles//GSM1974746_S3_H219.CEL.gz
Reading in : celfiles//GSM1974747_S3_H220.CEL.gz
Reading in : celfiles//GSM1974748_S3_H221.CEL.gz
Reading in : celfiles//GSM1974749_S3_H222.CEL.gz
Reading in : celfiles//GSM1974750_S3_H224.CEL.gz
Reading in : celfiles//GSM1974751_S3_H225B.CEL.gz
Reading in : celfiles//GSM1974752_S3_H226B.CEL.gz
Reading in : celfiles//GSM1974753_S3_H227B.CEL.gz
Reading in : celfiles//GSM1974754_S3_H228.CEL.gz
Reading in : celfiles//GSM1974755_S3_H229.CEL.gz
Reading in : celfiles//GSM1974756_S3_H230.CEL.gz
Reading in : celfiles//GSM1974757_S3_H93C.CEL.gz
Reading in : celfiles//GSM1974758_S3_H94C.CEL.gz
Reading in : celfiles//GSM1974759_S3_H95C.CEL.gz
Reading in : celfiles//GSM1974760_S3_H96C.CEL.gz
Reading in : celfiles//GSM1974761_S3_H97CC.CEL.gz
Reading in : celfiles//GSM1974762_S3_H98C.CEL.gz
Reading in : celfiles//GSM1974763_S3_H99C.CEL.gz
Reading in : celfiles//GSM1978883_S1_H11.CEL.gz
Reading in : celfiles//GSM1978884_S1_H12.CEL.gz
Reading in : celfiles//GSM1978885_S1_H13.CEL.gz
Reading in : celfiles//GSM1978886_S1_H15.CEL.gz
Reading in : celfiles//GSM1978887_S1_H16.CEL.gz
Reading in : celfiles//GSM1978888_S1_H17.CEL.gz
Reading in : celfiles//GSM1978889_S1_H18.CEL.gz
Reading in : celfiles//GSM1978890_S1_H1B.CEL.gz
Reading in : celfiles//GSM1978891_S1_H21.CEL.gz
Reading in : celfiles//GSM1978892_S1_H23.CEL.gz
Reading in : celfiles//GSM1978893_S1_H25.CEL.gz
Reading in : celfiles//GSM1978894_S1_H30.CEL.gz
Reading in : celfiles//GSM1978895_S1_H32.CEL.gz
Reading in : celfiles//GSM1978896_S1_H37.CEL.gz
Reading in : celfiles//GSM1978897_S1_H39.CEL.gz
Reading in : celfiles//GSM1978898_S1_H42.CEL.gz
Reading in : celfiles//GSM1978899_S1_H5.CEL.gz
Reading in : celfiles//GSM1978900_S3_H195.CEL.gz
Reading in : celfiles//GSM1978901_S3_H223.CEL.gz
Reading in : celfiles//GSM1978902_S4_H231.CEL.gz
Reading in : celfiles//GSM1978903_S4_H232.CEL.gz
Reading in : celfiles//GSM1978904_S4_H233.CEL.gz
Reading in : celfiles//GSM1978905_S4_H234.CEL.gz
Reading in : celfiles//GSM1978906_S4_H235.CEL.gz
Reading in : celfiles//GSM1978907_S4_H236.CEL.gz
Reading in : celfiles//GSM1978908_S4_H237.CEL.gz
Reading in : celfiles//GSM1978909_S4_H238.CEL.gz
Reading in : celfiles//GSM1978910_S4_H239.CEL.gz
Reading in : celfiles//GSM1978911_S4_H240.CEL.gz
Reading in : celfiles//GSM1978912_S4_H241.CEL.gz
Reading in : celfiles//GSM1978913_S4_H242.CEL.gz
Reading in : celfiles//GSM1978914_S4_H243.CEL.gz
Reading in : celfiles//GSM1978915_S4_H244.CEL.gz
Reading in : celfiles//GSM1978916_S4_H245.CEL.gz
Reading in : celfiles//GSM1978917_S4_H246.CEL.gz
Reading in : celfiles//GSM1978918_S4_H247.CEL.gz
Reading in : celfiles//GSM1978919_S4_H248.CEL.gz
Reading in : celfiles//GSM1978920_S4_H249.CEL.gz
Reading in : celfiles//GSM1978921_S4_H250.CEL.gz
Reading in : celfiles//GSM1978922_S4_H251.CEL.gz
Reading in : celfiles//GSM1978923_S4_H252.CEL.gz
Reading in : celfiles//GSM1978924_S4_H253.CEL.gz
Reading in : celfiles//GSM1978925_S4_H254.CEL.gz
Reading in : celfiles//GSM1978926_S4_H255.CEL.gz
Reading in : celfiles//GSM1978927_S4_H256.CEL.gz
Reading in : celfiles//GSM1978928_S4_H257B.CEL.gz
Reading in : celfiles//GSM1978929_S4_H258B.CEL.gz
Reading in : celfiles//GSM1978930_S4_H259.CEL.gz
Reading in : celfiles//GSM1978931_S4_H261.CEL.gz
Reading in : celfiles//GSM1978932_S4_H262.CEL.gz
Reading in : celfiles//GSM1978933_S4_H263.CEL.gz
Reading in : celfiles//GSM1978934_S4_H264.CEL.gz
Reading in : celfiles//GSM1978935_S4_H265.CEL.gz
Reading in : celfiles//GSM1978936_S4_H266.CEL.gz
Reading in : celfiles//GSM1978937_S4_H267.CEL.gz
Reading in : celfiles//GSM1978938_S4_H268.CEL.gz
Reading in : celfiles//GSM1978939_S4_H269.CEL.gz
Reading in : celfiles//GSM1978940_S4_H270.CEL.gz
Reading in : celfiles//GSM1978941_S4_H271.CEL.gz
Reading in : celfiles//GSM1978942_S4_H272.CEL.gz
Reading in : celfiles//GSM1978943_S4_H273.CEL.gz
Reading in : celfiles//GSM1978944_S4_H274.CEL.gz
Reading in : celfiles//GSM1978945_S4_H275B.CEL.gz
Reading in : celfiles//GSM1978946_S4_H276B.CEL.gz
Reading in : celfiles//GSM1978947_S4_H278.CEL.gz
Reading in : celfiles//GSM1978948_S4_H279B.CEL.gz
Reading in : celfiles//GSM1978949_S4_H280B.CEL.gz
Warning in read.celfiles(celFiles, phenoData = AnnotatedDataFrame(mdata_subset)) :
'channel' automatically added to varMetadata in phenoData.
# saveRDS(object = rawData, "rawData.RDS")
rawData <- readRDS("rawData.RDS")
Looking at the dimensions of the raw expression matrix.
exprs(rawData) %>% dim()
[1] 1354896 265
Looking at the number of samples for triple negative status and for set.
mdata_subset %>%
count(triple_negative_status, set)
mdata_subset %>%
count(triple_negative_status) %>%
mutate(proportion = round(n/sum(n), 3)) %>%
ggplot() +
geom_col(mapping = aes(y = "dummy_group",
x = proportion,
fill = triple_negative_status)) +
theme(axis.text.x = element_text(angle = 90, size = 7),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.y = element_blank(),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5),
legend.direction = "horizontal", legend.position = "top") +
geom_text(aes(y = 1,
x = proportion,
label = proportion,
group = triple_negative_status),
size = 5,
position = position_stack(vjust = 0.5),
color = "white") +
scale_fill_npg(name = "Triple Negative Status") +
labs(x = "Proportion",
title = str_wrap("Proportions of TNBC status values for GSE76275", 60))
ggsave(filename = "plots/exploration_plots/GSE76275_tnbc_proportion_barplot.png")
Saving 5.03 x 3.11 in image
mdata_subset %>%
count(triple_negative_status, set) %>%
mutate(proportion = round(n/sum(n), 3)) %>%
ggplot() +
geom_col(mapping = aes(x = triple_negative_status,
y = proportion,
fill = set)) +
geom_text(aes(x = triple_negative_status,
y = proportion,
label = proportion,
group = set),
size = 3,
position = position_stack(vjust = 0.5),
color = "white") +
theme(title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5),
legend.direction = "vertical", legend.position = "right") +
scale_fill_npg(name = "Sample Set") +
labs(x = "TNBC Status",
y = "Proportion",
title = str_wrap("Proportions of sample set values for GSE76275", 60))
ggsave(filename = "plots/exploration_plots/GSE76275_set_proportion_barplot.png")
Saving 5.03 x 3.11 in image
mdata_subset %>%
count(triple_negative_status, pr) %>%
mutate(proportion = round(n/sum(n), 3)) %>%
ggplot() +
geom_col(mapping = aes(x = pr,
y = proportion,
fill = triple_negative_status)) +
geom_text(aes(x = pr,
y = proportion,
label = proportion,
group = triple_negative_status),
size = 3,
position = position_stack(vjust = 0.5),
color = "white") +
theme(title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5),
legend.direction = "vertical", legend.position = "right") +
scale_fill_npg(name = "Triple Negative Status") +
labs(x = "Progesterone Receptor Status",
y = "Proportion",
title = str_wrap("Proportions of progesterone receptor status values for GSE76275", 60))
ggsave(filename = "plots/exploration_plots/GSE76275_pr_proportion_barplot.png")
Saving 5.03 x 3.11 in image
mdata_subset %>%
count(triple_negative_status, er) %>%
mutate(proportion = round(n/sum(n), 3)) %>%
ggplot() +
geom_col(mapping = aes(x = er,
y = proportion,
fill = triple_negative_status)) +
geom_text(aes(x = er,
y = proportion,
label = proportion,
group = triple_negative_status),
size = 3,
position = position_stack(vjust = 0.5),
color = "white") +
theme(title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5),
legend.direction = "vertical", legend.position = "right") +
scale_fill_npg(name = "Triple Negative Status") +
labs(x = "Estrogen Receptor Status",
y = "Proportion",
title = str_wrap("Proportions of estrogen receptor status values for GSE76275", 60))
ggsave(filename = "plots/exploration_plots/GSE76275_er_proportion_barplot.png")
Saving 5.03 x 3.11 in image
mdata_subset %>%
count(triple_negative_status, her2) %>%
mutate(proportion = round(n/sum(n), 3)) %>%
ggplot() +
geom_col(mapping = aes(x = her2,
y = proportion,
fill = triple_negative_status)) +
geom_text(aes(x = her2,
y = proportion,
label = proportion,
group = triple_negative_status),
size = 3,
position = position_stack(vjust = 0.5),
color = "white") +
theme(title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5),
legend.direction = "vertical", legend.position = "right") +
scale_fill_npg(name = "Triple Negative Status") +
labs(x = "HER2 Amplification Status",
y = "Proportion",
title = str_wrap("Proportions of HER2 amplification status values for GSE76275", 60))
ggsave(filename = "plots/exploration_plots/GSE76275_her2_proportion_barplot.png")
Saving 5.03 x 3.11 in image
mdata_subset %>%
ggplot() +
geom_boxplot(aes(x = triple_negative_status, y = age_years, fill = set)) +
scale_fill_npg(name = "Sample Set") +
theme_light() +
labs(x = "Triple Negative Status",
y = "Age in Years",
title = str_wrap("Age distribution of sample sets for GSE76275", 60))
Warning: Removed 9 rows containing non-finite values (stat_boxplot).
ggsave("plots/exploration_plots/GSE76275_age_boxplot.png")
Saving 5.03 x 3.11 in image
Warning: Removed 9 rows containing non-finite values (stat_boxplot).
list("ER" = mdata_subset$geo_accession[mdata_subset$er == "Positive"],
"PR" = mdata_subset$geo_accession[mdata_subset$pr == "Positive"],
"HER2" = mdata_subset$geo_accession[mdata_subset$her2 == "Positive"]) %>%
fromList(.) %>%
upset(., c("ER", "PR", "HER2"), width_ratio = 0.2) +
ggtitle(str_wrap("Upset plot for different combinations of ER, PR, and HER2 status in the non-TNBC samples in GSE76275", 40)) +
theme(title = element_text(size = 10))
ggsave("plots/exploration_plots/GSE76275_nonTNBC_upset.png")
Saving 5.03 x 3.11 in image
res_1 <- rma(rawData)
Loading required package: RSQLite
Loading required package: DBI
Background correcting
Normalizing
Calculating Expression
# saveRDS(object = res_1, "res_1.RDS")
res_1 <- readRDS("res_1.RDS")
exprs(res_1) %>%
dim()
[1] 54675 265
exprs(res_1)[1:5, 1:5]
GSM1974566 GSM1974567 GSM1974568 GSM1974569
1007_s_at 10.754163 11.361803 9.690693 10.157010
1053_at 8.625663 9.011796 7.854072 7.925281
117_at 7.333973 7.385199 7.466878 8.048563
121_at 9.077887 8.782080 8.885189 8.775218
1255_g_at 4.656331 4.635625 4.536114 4.626950
GSM1974570
1007_s_at 10.597699
1053_at 8.456781
117_at 7.581895
121_at 8.752407
1255_g_at 5.454820
Getting lists of the TNBC samples and the non-TNBC samples.
tnbc_samples <- mdata_subset %>%
filter(triple_negative_status == "TN") %>%
select(geo_accession) %>%
unlist(use.names = F) %>%
as.character()
head(tnbc_samples)
[1] "GSM1974566" "GSM1974567" "GSM1974568" "GSM1974569"
[5] "GSM1974570" "GSM1974571"
nontnbc_samples <- mdata_subset %>%
filter(triple_negative_status == "not TN") %>%
select(geo_accession) %>%
unlist(use.names = F) %>%
as.character()
head(nontnbc_samples)
[1] "GSM1978883" "GSM1978884" "GSM1978885" "GSM1978886"
[5] "GSM1978887" "GSM1978888"
Creating different metadata tables for TNBC and nonTNBC.
mdata_subset_tnbc <- mdata_subset[tnbc_samples, ]
dim(mdata_subset_tnbc)
[1] 198 12
mdata_subset_nontnbc <- mdata_subset[nontnbc_samples, ]
dim(mdata_subset_nontnbc)
[1] 67 12
Reading in the TNBC files.
# rawData_tnbc <- read.celfiles(filenames = celFiles[tnbc_samples],
# phenoData = AnnotatedDataFrame(mdata_subset_tnbc))
#
# rawData_tnbc
# saveRDS(rawData_tnbc, file = "rawData_tnbc.RDS")
rawData_tnbc <- readRDS(file = "rawData_tnbc.RDS")
rawData_tnbc
ExpressionFeatureSet (storageMode: lockedEnvironment)
assayData: 1354896 features, 198 samples
element names: exprs
protocolData
rowNames: GSM1974566 GSM1974567 ... GSM1974763 (198 total)
varLabels: exprs dates
varMetadata: labelDescription channel
phenoData
rowNames: GSM1974566 GSM1974567 ... GSM1974763 (198 total)
varLabels: geo_accession title ... age_years (11 total)
varMetadata: labelDescription channel
featureData: none
experimentData: use 'experimentData(object)'
Annotation: pd.hg.u133.plus.2
Loading required package: pd.hg.u133.plus.2
Loading required package: RSQLite
Loading required package: DBI
Reading in the nonTNBC files.
# rawData_nontnbc <- read.celfiles(filenames = celFiles[nontnbc_samples],
# phenoData = AnnotatedDataFrame(mdata_subset_nontnbc))
#
# rawData_nontnbc
# saveRDS(rawData_nontnbc, file = "rawData_nontnbc.RDS")
rawData_nontnbc <- readRDS(file = "rawData_nontnbc.RDS")
rawData_nontnbc
ExpressionFeatureSet (storageMode: lockedEnvironment)
assayData: 1354896 features, 67 samples
element names: exprs
protocolData
rowNames: GSM1978883 GSM1978884 ... GSM1978949 (67
total)
varLabels: exprs dates
varMetadata: labelDescription channel
phenoData
rowNames: GSM1978883 GSM1978884 ... GSM1978949 (67
total)
varLabels: geo_accession title ... age_years (11
total)
varMetadata: labelDescription channel
featureData: none
experimentData: use 'experimentData(object)'
Annotation: pd.hg.u133.plus.2
Performing RMA on TNBC data.
# res_tnbc <- rma(rawData_tnbc)
# saveRDS(res_tnbc, file = "res_tnbc.RDS")
res_tnbc <- readRDS(file = "res_tnbc.RDS")
Performing RMA on nonTNBC data.
# res_nontnbc <- rma(rawData_nontnbc)
# saveRDS(res_nontnbc, file = "res_nontnbc.RDS")
res_nontnbc <- readRDS(file = "res_nontnbc.RDS")
Combining the expression matrices of TNBC and nonTNBC data after separate RMA.
res_joint <- cbind(exprs(res_tnbc), exprs(res_nontnbc))
res_joint[1:5, 1:5]
GSM1974566 GSM1974567 GSM1974568 GSM1974569
1007_s_at 10.703231 11.345325 9.732361 10.175704
1053_at 8.605269 9.019553 7.794234 7.945029
117_at 7.360884 7.373951 7.481344 8.047586
121_at 9.100729 8.776369 8.860402 8.755606
1255_g_at 4.664492 4.628692 4.569530 4.627230
GSM1974570
1007_s_at 10.576463
1053_at 8.497373
117_at 7.530427
121_at 8.766560
1255_g_at 5.467292
Saving the joint expression matrix from class-specific QN.
res_joint %>%
as_tibble(rownames = "probe_id") %>%
write_csv("dataframe_files/post_classQN_expression.csv")
Error: Cannot open file for writing:
* 'dataframe_files/post_classQN_expression.csv'
Saving a subset of the metadata that I think is relevant.
mdata_subset %>%
write_csv("dataframe_files/metadata_subset.csv")
Saving the TNBC and nonTNBC metadata separately, just in case.
mdata_subset_tnbc %>%
write_csv("dataframe_files/metadata_subset_tnbc.csv")
mdata_subset_nontnbc %>%
write_csv("dataframe_files/metadata_subset_nontnbc.csv")
res_1_df_long <- res_1 %>%
exprs() %>%
as_tibble(rownames = "probeID") %>%
pivot_longer(cols = all_of(c(tnbc_samples, nontnbc_samples)), names_to = "sample_id",
values_to = "intensity") %>%
left_join(., mdata_subset, by = c("sample_id" = "geo_accession"))
# saveRDS(object = res_1_df_long, "res_1_df_long.RDS")
res_1_df_long <- readRDS("res_1_df_long.RDS")
p2 <- res_1_df_long %>%
ggplot() +
geom_boxplot(mapping = aes(x = reorder(sample_id, as.numeric(set)), y = intensity, color = set), outlier.size = 0.2)
p2 <- p2 + labs(x = "samples",
title = str_wrap("Sample-wise log2 intensity boxplots for global quantile normalization for GSE76275", 60)) +
scale_color_npg(name = "Triple Negative Status") +
theme_light() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
title = element_text(size = 10),
legend.position = "top",
legend.direction = "horizontal")
p2
NA
ggsave("plots/exploration_plots/GSE76275_post_regQN_boxplots.png",
p2,
units = "cm",
width = 30,
height = 10)
rm(res_1_df_long)
res_joint_df_long <- res_joint %>%
as_tibble(rownames = "probeID") %>%
pivot_longer(cols = all_of(c(tnbc_samples, nontnbc_samples)), names_to = "sample_id",
values_to = "intensity")
Error in as_tibble(., rownames = "probeID") :
object 'res_joint' not found
saveRDS(object = res_joint_df_long, "res_joint_df_long.RDS")
Error in saveRDS(object = res_joint_df_long, "res_joint_df_long.RDS") :
object 'res_joint_df_long' not found
p1 <- res_joint_df_long %>%
left_join(., mdata_subset, by = c("sample_id" = "geo_accession")) %>%
mutate(sample_id = factor(sample_id)) %>%
ggplot() +
geom_boxplot(mapping = aes(x = reorder(sample_id, as.numeric(set)), y = intensity, color = set), outlier.size = 0.2)
p1 <- p1 + labs(x = "samples",
title = str_wrap("Sample-wise log2 intensity boxplots for class-specific quantile normalization for GSE76275", 60)) +
scale_color_npg(name = "Triple Negative Status") +
theme_light() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
title = element_text(size = 10),
legend.position = "top",
legend.direction = "horizontal")
p1
ggsave("plots/exploration_plots/GSE76275_post_classQN_boxplots.png",
p1,
units = "cm", width = 20, height = 10)
res_joint_df_long %>%
left_join(., mdata_subset, by = c("sample_id" = "geo_accession")) %>%
mutate(sample_id = factor(sample_id)) %>%
ggplot() +
geom_boxplot(mapping = aes(x = reorder(sample_id, as.numeric(set)), y = intensity, fill = set), outlier.size = 0.2) +
labs(x = "samples",
title = str_wrap("Sample-wise log2 intensity boxplots for class-specific quantile normalization for GSE76275", 60)) +
scale_fill_npg(name = "Triple Negative Status") +
theme_light() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
title = element_text(size = 10),
legend.position = "right",
legend.direction = "vertical")
ggsave("plots/exploration_plots/GSE76275_post_classQN_boxplots_bad.png", units = "cm", width = 20, height = 10)
Function to create an annotated data frame by combining PC scores as well as metadata: useful for ggplot visualization.
get_pca_annot_df <- function(pca.obj, sample_id_col, mdata_df){
ind_scores <- pca.obj$x
ind_scores_reordered <- ind_scores[match(rownames(ind_scores), mdata_df[[sample_id_col]]), ] %>%
as_tibble(rownames = sample_id_col) %>%
mutate(filename = factor(!!sym(sample_id_col)))
ind_scores_annot <- left_join(ind_scores_reordered, y = mdata_df, by = sample_id_col) %>%
select(all_of(colnames(mdata_subset)), contains("PC"))
return(ind_scores_annot)
}
# pca.res_1 <- res_1 %>%
# exprs() %>%
# t() %>%
# prcomp(center = TRUE, scale = TRUE)
# saveRDS(pca.res_1, "pca_res1.RDS")
pca.res_1 <- readRDS("pca_res1.RDS")
Getting the annotated data frame for the PCA.
pca.res_1.annot_df <- get_pca_annot_df(pca.obj = pca.res_1, sample_id_col = "geo_accession", mdata_df= mdata_subset)
head(pca.res_1.annot_df)
Looking at the variance explained by the first 10 PCs.
fviz_eig(pca.res_1) +
labs(x = "Principal Component",
title = str_wrap("Scree plot for the first 10 principal components for global RMA-normalized data for GSE76275", 60)) +
theme(title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5))
ggsave("plots/exploration_plots/PCA_wholeQN_scree.png", bg = "white")
Saving 5.03 x 3.11 in image
Superimposing variables in data upon sample PCA scores. The PCA does not seem to separate the TNBC and nonTNBC samples that well when regular RMA is performed.
ggplot(pca.res_1.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = triple_negative_status)) +
ggtitle(str_wrap("Samples in first two PCs, coloured by triple negative status for GSE76275 after global quantile normalization", 60)) +
scale_color_npg(name = "Triple Negative Status") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5))
ggsave("plots/exploration_plots/PCA_wholeQN_TNBC_status.png", bg = "white")
Saving 5.03 x 3.11 in image
NA
The samples do not seem to separate well by set either.
ggplot(pca.res_1.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = set)) +
ggtitle(str_wrap("Samples in first two PCs, coloured by set (discovery or validation) for GSE76275 after global quantile normalization", 60)) +
scale_color_aaas(name = "Sample Set") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5))
ggsave("plots/exploration_plots/PCA_wholeQN_set.png", bg = "white")
Saving 5.03 x 3.11 in image
There does not seem to be too strong of a batch effect according to submission date.
ggplot(pca.res_1.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = submission_date)) +
ggtitle("Samples in first two PCs, \ncoloured by submission date for whole QN") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5))
ggsave("plots/exploration_plots/PCA_wholeQN_set.png")
ggplot(pca.res_1.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = tnbc_subtype)) +
ggtitle("Samples in first two PCs, \ncoloured by tnbc_subtype for whole QN") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5), legend.position = "right", legend.direction = "vertical", legend.key.width = unit(x = 0.5, units = "cm"))
pca.res_joint <- res_joint %>%
t() %>%
prcomp(center = TRUE, scale = TRUE)
# saveRDS(pca.res_joint, "pca_res_joint.RDS")
pca.res_joint <- readRDS("pca_res_joint.RDS")
Getting the annotated data frame for the PCA.
pca.res_joint.annot_df <- get_pca_annot_df(pca.obj = pca.res_joint, sample_id_col = "geo_accession", mdata_df= mdata_subset)
head(pca.res_joint.annot_df)
Looking at the variance explained by the first 10 PCs.
fviz_eig(pca.res_joint) +
labs(x = "Principal Component",
title = str_wrap("Scree plot for the first 10 principal components for class-specific RMA-normalized data for GSE76275", 60)) +
theme(title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5))
ggsave("plots/exploration_plots/PCA_classQN_scree.png")
Saving 5.03 x 3.11 in image
Superimposing variables in data upon sample PCA scores. The PCA does separate the TNBC and nonTNBC samples well when class-specific RMA is performed.
ggplot(pca.res_joint.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = triple_negative_status)) +
ggtitle(str_wrap("Samples in first two PCs, coloured by triple negative status for GSE76275 after class-specific quantile normalization", 60)) +
scale_color_npg(name = "Triple Negative Status") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5))
ggsave("plots/exploration_plots/PCA_classQN_TNBC_status.png")
Saving 5.03 x 3.11 in image
The validation nonTNBC samples are separated from the discovery TNBC and validation TNBC samples.
ggplot(pca.res_joint.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = set)) +
ggtitle(str_wrap("Samples in first two PCs, coloured by set (discovery or validation) for GSE76275 after class-specific quantile normalization", 60)) +
scale_color_aaas(name = "Sample Set") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5))
ggsave("plots/exploration_plots/PCA_classQN_TNBC_set.png")
Saving 5.03 x 3.11 in image
Submission date is perfectly confounded with TNBC status. May or may not be batch effects.
ggplot(pca.res_joint.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = submission_date)) +
ggtitle(str_wrap("Samples in first two PCs, coloured by submission date for GSE76275 after class-specific quantile normalization", 60)) +
scale_color_npg(name = "Submission Date") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5))
ggsave("plots/exploration_plots/PCA_classQN_TNBC_date.png")
Saving 5.03 x 3.11 in image
ggplot(pca.res_joint.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = tnbc_subtype)) +
ggtitle(str_wrap("Samples in first two PCs, coloured by subtype of TNBC for GSE76275 after class-specific quantile normalization", 60)) +
scale_color_nejm(name = "TNBC Subtype") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5),
legend.text = element_text(size = 5), legend.key.height = unit(x = 0.3, units = "cm"), legend.key.width = unit(x = 0.3, units = "cm")) +
guides(color = guide_legend(override.aes = list(size = 1)))
ggsave("plots/exploration_plots/PCA_classQN_TNBC_subtype.png")
Saving 5.03 x 3.11 in image
ggplot(pca.res_joint.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2,
colour = factor(pr), shape = triple_negative_status)) +
ggtitle(str_wrap("Samples in first two PCs, coloured by progesterone receptor status for GSE76275 after class-specific quantile normalization", 60)) +
scale_color_npg(name = "Progesterone Receptor Status") +
scale_shape_manual(name = "Triple Negative Status", values = c("TN" = 17, "not TN" = 1)) +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5),
legend.title = element_text(size = 7),
legend.text = element_text(size = 5),
legend.key.height = unit(x = 0.3, units = "cm"),
legend.key.width = unit(x = 0.3, units = "cm")) +
guides(color = guide_legend(override.aes = list(size = 1)))
ggsave("plots/exploration_plots/PCA_classQN_pr_GSE76275.png", bg= "white")
Saving 5.03 x 3.11 in image
ggplot(pca.res_joint.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2,
colour = factor(her2), shape = triple_negative_status)) +
ggtitle(str_wrap("Samples in first two PCs, coloured by HER2 amplification status for GSE76275 after class-specific quantile normalization", 60)) +
scale_color_nejm(name = "HER2 Amplification Status") +
scale_shape_manual(name = "Triple Negative Status", values = c("TN" = 17, "not TN" = 1)) +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5),
legend.title = element_text(size = 7),
legend.text = element_text(size = 5),
legend.key.height = unit(x = 0.3, units = "cm"),
legend.key.width = unit(x = 0.3, units = "cm")) +
guides(color = guide_legend(override.aes = list(size = 1)))
ggsave("plots/exploration_plots/PCA_classQN_her2_GSE76275.png", bg = "white")
Saving 5.03 x 3.11 in image
ggplot(pca.res_joint.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2,
colour = factor(er), shape = triple_negative_status)) +
ggtitle(str_wrap("Samples in first two PCs, coloured by estrogen receptor status for GSE76275 after class-specific quantile normalization", 60)) +
scale_color_npg(name = "Estrogen Receptor Status") +
scale_shape_manual(name = "Triple Negative Status", values = c("TN" = 17, "not TN" = 1)) +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5),
legend.title = element_text(size = 7),
legend.text = element_text(size = 5),
legend.key.height = unit(x = 0.3, units = "cm"),
legend.key.width = unit(x = 0.3, units = "cm")) +
guides(color = guide_legend(override.aes = list(size = 1)))
ggsave("plots/exploration_plots/PCA_classQN_er_GSE76275.png")
Saving 5.03 x 3.11 in image
perform_min_max <- function(x){
mm_transformation <- preProcess(x, method = "range")
rescaled <- predict(mm_transformation, x)
return(rescaled)
}
Getting distances after performing min max normalization.
# res_1_dists <- exprs(res_1) %>%
# t() %>%
# perform_min_max() %>%
# dist(method = "euclidean")
# saveRDS(res_1_dists, "res_1_dists.RDS")
res_1_dists <- readRDS("res_1_dists.RDS")
# res_joint_dists <- res_joint %>%
# t() %>%
# perform_min_max() %>%
# dist(method = "euclidean")
# saveRDS(res_joint_dists, "res_joint_dists.RDS")
res_joint_dists <- readRDS("res_joint_dists.RDS")
res_1_dend <- res_1_dists %>%
hclust() %>%
as.dendrogram()
res_joint_dend <- res_joint_dists %>%
hclust() %>%
as.dendrogram()
library(dendextend)
# res_1_dend %>%
# labels()
# res_1_dend %>%
# order.dendrogram()
# (res_1 %>%
# exprs() %>%
# colnames())[28]
res_1_dend_laborder <- res_1_dend %>%
labels()
mycolors <- ifelse(mdata_subset[res_1_dend_laborder, ]$triple_negative_status == "TN", "forestgreen", "maroon")
par(mar = c(10,2,1,1))
res_1_dend %>%
set("labels_cex", 0.1) %>%
plot()
colored_bars(colors = mycolors, dend = res_1_dend, rowLabels = "TN Status", add = TRUE)
res_joint_dend_laborder <- res_joint_dend %>%
labels()
mycolors <- ifelse(mdata_subset[res_joint_dend_laborder, ]$triple_negative_status == "TN", "forestgreen", "maroon")
par(mar = c(10,2,1,1))
res_joint_dend %>%
set("labels_cex", 0.1) %>%
plot()
colored_bars(colors = mycolors, dend = res_joint_dend, rowLabels = "TN Status", add = TRUE)
Function to process distance object into a distance matrix for heatmap visualization.
get_distmat <- function(x){
distmat <- as.matrix(x)
colnames(distmat) <- NULL
diag(distmat) <- NA
return(distmat)
}
row_annot <- mdata_subset %>%
mutate(er = factor(er), pr = factor(pr), her2 = factor(her2)) %>%
select(set, submission_date, pr, er, her2) %>%
rename(`Sample set` = set,
`Submission date` = submission_date,
`HER2 status` = her2,
`ER status` = er,
`PR status` = pr)
head(row_annot)
set.seed(5)
row_colours <- list( "Sample set" = c("darkgoldenrod4", "darkmagenta", "salmon"),
"Submission date" = pal_nejm()(2),
"PR status" = pal_lancet()(9)[1:2],
"ER status" = pal_lancet()(9)[8:9],
"HER2 status" = pal_npg()(9)[c(3, 2, 8)])
names(row_colours[["Sample set"]]) <- as.character(unique(row_annot[["Sample set"]]))
names(row_colours[["PR status"]]) <- unique(row_annot[["PR status"]])
names(row_colours[["ER status"]]) <- unique(row_annot[["ER status"]])
names(row_colours[["HER2 status"]]) <- unique(row_annot[["HER2 status"]])
names(row_colours[["Submission date"]]) <- as.character(unique(row_annot[["Submission date"]]))
str(row_colours)
List of 5
$ Sample set : Named chr [1:3] "darkgoldenrod4" "darkmagenta" "salmon"
..- attr(*, "names")= chr [1:3] "Validation TN" "Discovery TN" "Validation not TN"
$ Submission date: Named chr [1:2] "#BC3C29FF" "#0072B5FF"
..- attr(*, "names")= chr [1:2] "Dec 17 2015" "Dec 22 2015"
$ PR status : Named chr [1:2] "#00468BFF" "#ED0000FF"
..- attr(*, "names")= chr [1:2] "Negative" "Positive"
$ ER status : Named chr [1:2] "#ADB6B6FF" "#1B1919FF"
..- attr(*, "names")= chr [1:2] "Negative" "Positive"
$ HER2 status : Named chr [1:3] "#00A087FF" "#4DBBD5FF" "#DC0000FF"
..- attr(*, "names")= chr [1:3] "Negative" "Positive" "Not Available"
my_colours <- viridis(265^2, begin = 1, end = 0)
res_joint_dists %>%
get_distmat() %>%
pheatmap(.,
color = my_colours,
annotation_row = row_annot,
annotation_colors = row_colours,
show_colnames = F,
show_rownames = F,
cutree_rows = 2,
cutree_cols = 2,
main = str_wrap("Heatmap of sample distances for class-specific QN expression matrix for GSE76275", 60),
legend_labels = c("small distance", "large distance"),
legend_breaks = c(min(., na.rm = TRUE),
max(., na.rm = TRUE)),
filename = "plots/exploration_plots/classQN_clustering_heatmap_GSE76275.png")
full_mod <- mdata_subset %>%
select(geo_accession, triple_negative_status) %>%
arrange(triple_negative_status) %>%
model.matrix(~triple_negative_status, data = .)
head(full_mod)
(Intercept) triple_negative_statusTN
GSM1978883 1 0
GSM1978884 1 0
GSM1978885 1 0
GSM1978886 1 0
GSM1978887 1 0
GSM1978888 1 0
red_mod <- model.matrix(~1, data = mdata_subset)
head(red_mod)
(Intercept)
GSM1974566 1
GSM1974567 1
GSM1974568 1
GSM1974569 1
GSM1974570 1
GSM1974571 1
Get number of significant surrogate variables.
n.sv.wholeQN <- num.sv(exprs(res_1), full_mod, method="leek")
n.sv.wholeQN
[1] 0
svobj.wholeQN <- sva(exprs(res_1), mod = full_mod, mod0 = red_mod, n.sv = 1)
sv_df.wholeQN <- tibble("geo_accession" = colnames(exprs(res_1)), "sv" = svobj.wholeQN$sv)
head(sv_df.wholeQN)
left_join(sv_df.wholeQN, mdata, by = "geo_accession") %>%
mutate(index = 5) %>%
ggplot() +
# geom_col(mapping = aes(y = fct_reorder(geo_accession, sv, .fun = function(x){x}), x = sv, fill = set)) +
geom_boxplot(mapping = aes(x = submission_date, y = sv, fill = set)) +
theme_light() +
labs(y = "Surrogate Variable Value", title = "Distribution of latent variable estimated by SVA for different grouping factors")
# ggsave("plots/exploration_plots/sva_grouping_normalRMA.png")
Create full model matrix.
full_mod <- mdata_subset %>%
select(geo_accession, triple_negative_status) %>%
arrange(triple_negative_status) %>%
model.matrix(~triple_negative_status, data = .)
head(full_mod)
(Intercept) triple_negative_statusTN
GSM1978883 1 0
GSM1978884 1 0
GSM1978885 1 0
GSM1978886 1 0
GSM1978887 1 0
GSM1978888 1 0
Create reduced model matrix.
red_mod <- model.matrix(~1, data = mdata_subset)
head(red_mod)
(Intercept)
GSM1974566 1
GSM1974567 1
GSM1974568 1
GSM1974569 1
GSM1974570 1
GSM1974571 1
Get number of significant surrogate variables.
n.sv.classQN <- num.sv(res_joint, full_mod, method="leek")
n.sv.classQN
[1] 1
Perform SVA on classQN-normalized expression matrix.
svobj.classQN <- sva(res_joint, mod = full_mod, mod0 = red_mod, n.sv = n.sv.classQN)
Number of significant surrogate variables is: 1
Iteration (out of 5 ):1 2 3 4 5
sv_df.classQN <- tibble("geo_accession" = colnames(res_joint), "sv" = svobj.classQN$sv)
head(sv_df.classQN)
saveRDS(sv_df.classQN, "sv_df_classQN.RDS")
# sv_df.classQN <- readRDS("sv_df_classQN.RDS")
sv_df.classQN %>%
ggplot() +
geom_boxplot(mapping = aes(x = submission_date, y = sv, fill = er)) +
# geom_point(mapping = aes(x = submission_date, y = sv, color = er)) +
theme_light()
# labs(y = "Surrogate Variable Value", title = "Distribution of latent variable estimated by SVA for different grouping factors")
# ggsave("plots/exploration_plots/sva_grouping_classQN.png")
In this attempt, I perform no quantile normalization while performing RMA. If QN has not been performed and a surrogate variable shows up that corresponds to batch, batch effects are probably present.
rawData.summary <- rma(rawData, background = TRUE, normalize = FALSE)
Loading required package: RSQLite
Loading required package: DBI
Background correcting
Calculating Expression
rawData.summary_df_long <- rawData.summary %>%
exprs() %>%
as_tibble(rownames = "probeID") %>%
pivot_longer(cols = all_of(c(tnbc_samples, nontnbc_samples)), names_to = "sample_id",
values_to = "intensity") %>%
left_join(., mdata_subset, by = c("sample_id" = "geo_accession"))
p3 <- rawData.summary_df_long %>%
ggplot() +
geom_boxplot(mapping = aes(x = reorder(sample_id, as.numeric(set)), y = intensity,
color = set))
p3 <- p3 + labs(x = "samples",
title = str_wrap("Sample-wise log2 intensity boxplots for non- quantile normalized expression values for GSE76275", 60)) +
scale_color_npg(name = "Triple Negative Status") +
theme_light() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
title = element_text(size = 10),
legend.position = "top",
legend.direction = "horizontal")
p3
ggsave("plots/exploration_plots/GSE76275_noQN_boxplots.png",
p3,
units = "cm", width = 30, height = 10)
Create full model matrix.
full_mod <- mdata_subset %>%
select(geo_accession, triple_negative_status) %>%
arrange(triple_negative_status) %>%
model.matrix(~triple_negative_status, data = .)
head(full_mod)
(Intercept) triple_negative_statusTN
GSM1978883 1 0
GSM1978884 1 0
GSM1978885 1 0
GSM1978886 1 0
GSM1978887 1 0
GSM1978888 1 0
Create reduced model matrix.
red_mod <- model.matrix(~1, data = mdata_subset)
head(red_mod)
(Intercept)
GSM1974566 1
GSM1974567 1
GSM1974568 1
GSM1974569 1
GSM1974570 1
GSM1974571 1
Getting the number of surrogate variables in the absence of quantile normalization.
n.sv.nonorm <- num.sv(exprs(rawData.summary), full_mod, method="leek")
There is one surrogate variable present in the absence of QN.
n.sv.nonorm
[1] 1
svobj.nonorm <- sva(exprs(rawData.summary), mod = full_mod, mod0 = red_mod, n.sv = n.sv.nonorm)
Number of significant surrogate variables is: 1
Iteration (out of 5 ):1 2 3 4 5
sv_df.nonorm <- tibble("geo_accession" = colnames(exprs(rawData.summary)), "sv" = svobj.nonorm$sv)
head(sv_df.nonorm)
left_join(sv_df.nonorm, mdata_subset, by = "geo_accession") %>%
mutate(index = 5) %>%
ggplot() +
# geom_col(mapping = aes(y = fct_reorder(geo_accession, sv, .fun = function(x){x}), x = sv, fill = set)) +
geom_boxplot(mapping = aes(x = submission_date, y = sv, fill = set)) +
scale_fill_npg(name = "Sample Set") +
theme_light() +
labs(x = "Submission Date",
y = "Surrogate Variable Value",
title = str_wrap("Distribution of latent variable estimated by SVA for submission dates, coloured by sample set", 50))
ggsave("plots/exploration_plots/sva_grouping_date_noQN.png")
Saving 5.03 x 3.11 in image
left_join(sv_df.nonorm, mdata_subset, by = "geo_accession") %>%
mutate(index = 5) %>%
ggplot() +
geom_boxplot(mapping = aes(x = pr, y = sv, fill = set)) +
scale_fill_npg(name = "Sample Set") +
theme_light() +
labs(x = "Progesterone Receptor Status",
y = "Surrogate Variable Value",
title = str_wrap("Distribution of latent variable estimated by SVA for progesterone receptor status, coloured by sample set", 50))
ggsave("plots/exploration_plots/sva_grouping_pr_noQN.png")
Saving 5.03 x 3.11 in image
left_join(sv_df.nonorm, mdata_subset, by = "geo_accession") %>%
mutate(index = 5) %>%
ggplot() +
geom_boxplot(mapping = aes(x = er, y = sv, fill = set)) +
scale_fill_npg(name = "Sample Set") +
theme_light() +
labs(x = "Estrogen Receptor Status",
y = "Surrogate Variable Value",
title = str_wrap("Distribution of latent variable estimated by SVA for estrogen receptor status, coloured by sample set", 50))
ggsave("plots/exploration_plots/sva_grouping_er_noQN.png")
Saving 5.03 x 3.11 in image
left_join(sv_df.nonorm, mdata_subset, by = "geo_accession") %>%
mutate(index = 5) %>%
ggplot() +
geom_boxplot(mapping = aes(x = her2, y = sv, fill = set)) +
scale_fill_npg(name = "Sample Set") +
theme_light() +
labs(x = "HER2 Amplification Status",
y = "Surrogate Variable Value",
title = str_wrap("Distribution of latent variable estimated by SVA for HER2 amplification status, coloured by sample set", 50))
ggsave("plots/exploration_plots/sva_grouping_her2_noQN.png")
Saving 5.03 x 3.11 in image
left_join(sv_df.nonorm, mdata_subset, by = "geo_accession") %>%
mutate(index = 5) %>%
ggplot() +
geom_point(mapping = aes(x = as.numeric(age_years), y = sv, color = set)) +
scale_color_npg(name = "Sample set") +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5),
legend.title = element_text(size = 7),
legend.text = element_text(size = 5),
legend.key.height = unit(x = 0.3, units = "cm"),
legend.key.width = unit(x = 0.3, units = "cm")) +
guides(color = guide_legend(override.aes = list(size = 1))) +
labs(x = "Age in Years",
y = "Surrogate Variable Value",
title = str_wrap("Scatterplot of age latent variable estimated by SVA versus age", 50))
Warning: Removed 9 rows containing missing values (geom_point).
ggsave("plots/exploration_plots/sva_grouping_age_noQN.png")
Saving 5.03 x 3.11 in image
Warning: Removed 9 rows containing missing values (geom_point).